/*	This program creates a working dataset for main analysis sample centered
at the predicted start of the benefit calculation window, from ages 45 
up to 8 years after the start of the BCW.*/


* ---------------------------------------------------------------------------
* Folder locations. These are locals, so they are only defined while this
* do-file is running.
* ---------------------------------------------------------------------------
local dir_raw "~/Dropbox/Retirement gaming/raw"
local dir_clean "~/Dropbox/Retirement gaming/clean"
local dir_output "~/Dropbox/Retirement gaming/output/dataverse"
* Scratch folder: files saved here can be erased after running the do-file
local dir_temp "~/Dropbox/Retirement gaming/clean/temp"

* ---------------------------------------------------------------------------
* Person-level ids and baseline covariates taken from the main sample
* ---------------------------------------------------------------------------
use "`dir_clean'/mainsample_age50analysis.dta", clear

* Remember the range of W in the main sample; the globals are used further
* down to trim earnings outside this range
summarize W
global Wmin = r(min)
global Wmax = r(max)

* One row per distinct combination of id and baseline covariates
keep i *_1stobs withempl tenure1yrs_at49 firmid_at49
duplicates drop
save "`dir_temp'/mainsamplei.dta", replace

* Load the full administrative data and keep only rows whose id appears in
* the main-sample id file
use "`dir_clean'/admindata.dta", clear
merge m:1 i using "`dir_temp'/mainsamplei.dta"
keep if _merge == 3
drop _merge

**** SAMPLE RESTRICTIONS ****
* Keep industry and commerce only. aportaci is the contribution fund:
*   1=industry and commerce, 2=civil (public sector), 3=rural,
*   4=construction, 5=notarial, 6=banking, 48=domestic service
drop aportaci_min
keep if aportaci == 1

* Identify and drop people in early retirement regimes
drop if vf_min == 103             // people working while already retired
drop if inlist(vf_min, 97, 98)    // people reported with no service to the firm

* Birth cohort: calendar year of the date of birth
generate cohort = yofd(Fnac)

**** MERGE PREDICTED BCW ****
* The merge indicator is kept under a descriptive name instead of being dropped
merge m:1 i using "`dir_clean'/mainsample_predictions.dta"
rename _merge merged_predictedbcw


** MONTH AND AGE CENTERED AROUND AGE 50 **
generate birth_month = mofd(Fnac)
generate refbday_month = birth_month + 12*50      // month of the 50th birthday
summarize refbday_month

* Months elapsed since the 50th birthday (negative before age 50)
generate agemonths_centered = t - refbday_month

* Completed years relative to age 50
generate age_centered = floor(agemonths_centered/12)

* Age in months, age in years (fractional) and age in completed years
generate agemonths = agemonths_centered + 12*50
generate age = agemonths/12
generate agedisc = floor(age)

************************************************************
*** MONTH OF OBSERVATION CENTERED AROUND *PREDICTED* BCW ***
************************************************************
* Runs once per predicted-BCW variable (medbcw, expbcw).
* NOTE(review): 12 times the variable is added to the month of the 50th
* birthday, so it is presumably measured in years after age 50 - confirm.
foreach bcw in medbcw expbcw {
	* Reference month = 50th birthday shifted by the predicted BCW start
	generate refbday_month_`bcw' = refbday_month + round(12*`bcw') if `bcw' != .
	summarize refbday_month_`bcw'
	format refbday_month_`bcw' %tm

	* Months elapsed since the reference month
	generate timemonths_bcw_`bcw' = t - refbday_month_`bcw'

	* Event time in years: 12-month bins from -5 through 7
	generate time_bcw_`bcw' = .
	forvalues yr = -5/7 {
		local lo = `yr'*12
		local hi = (`yr'+1)*12
		replace time_bcw_`bcw' = `yr' if timemonths_bcw_`bcw' >= `lo' & timemonths_bcw_`bcw' < `hi'
	}

	* Lump into bin -6 every observation more than 5 years before the reference
	* month, provided age_centered is at least -5 (i.e. within the main sample)
	replace time_bcw_`bcw' = -6 if time_bcw_`bcw' == . & timemonths_bcw_`bcw' < -5*12 & age_centered >= -5
}

	
* Employment-type indicators
generate self_empl = status_1 == 1      // self-employed
generate empl = status_3 == 1           // employed

* Keep self-employed and employed observations only
keep if self_empl == 1 | empl == 1

* Drop observations with no reported earnings
* (dropping W==0 also removes periods with paid leave)
drop if W == .
drop if W == 0

* Keep only salaried work observations
generate sample_salary = tipREM_min == 1
keep if sample_salary == 1

* Drop earnings outliers: anything outside the W range of the main sample,
* as stored in the globals Wmin and Wmax
summarize W, detail
drop if W < $Wmin | W > $Wmax

*** Keep only main job and drop duplicates
* Step 1: among duplicate id-month rows, prefer salaried jobs over jobs with
* daily or commission pay. Every remaining row already has sample_salary==1
* because of the keep above, so this step cannot drop anything here.
duplicates tag i t, generate(tag)
tabulate tag sample_salary
bysort i t: egen maxsal = max(sample_salary)
drop if tag > 0 & maxsal == 1 & sample_salary == 0
drop tag maxsal

* Step 2: among remaining duplicates, keep the row(s) with the highest earnings
duplicates tag i t, generate(tag)
bysort i t: egen maxrem = max(W)
drop if tag > 0 & W < maxrem
drop tag maxrem

* Step 3: report what is still duplicated, then force one row per id-month
* among rows tied on earnings
duplicates tag i t, generate(tag)
tabulate tag
sort i t
duplicates drop i t W, force
drop tag

*** firm size
* Bin the number of dependent workers (ndep) into six size classes.
label define size_cat 0 "Micro less than 5 " 1 "Micro 5-9" 2 "Small 10-19" 3 "Small 20-49" 4 "Medium 50-249" 5 "Large 250 plus"
foreach X in ndep {
	g 		`X'_cat=0 if `X' <5 // micro 1 less than 5
	replace `X'_cat=1 if `X'>=5 & `X'<10 // micro 2
	replace `X'_cat=2 if `X'>=10 & `X'<20 // small 1
	replace `X'_cat=3 if `X'>=20 & `X'<50 // small 2
	replace `X'_cat=4 if `X'>=50 & `X'<250 // medium
	* Stata treats missing as larger than any number, so without the upper
	* bound a missing size would be classified as a large firm. Missing sizes
	* are left as missing categories instead.
	replace `X'_cat=5 if `X'>=250 & `X'<. // large
	label values `X'_cat size_cat
}

* Calendar year of the observation month
cap drop year
g year=yofd(dofm(t))


*** 2-digit CIIU
* Take the first two characters of the industry code written as a string.
* NOTE(review): if ciiu is stored as a number, codes that should start with
* a zero lose that digit and land in a different 2-digit group - confirm how
* ciiu is stored upstream.
cap drop aux
tostring ciiu, g(aux)
g ciiu2=substr(aux,1,2)
destring ciiu2, replace
drop aux

* Save data
save "`dir_temp'/datawithpredbcw.dta", replace

* Add the minimum contribution base (MCB) series to the panel

* List of distinct months present in the panel
tempfile time
use "`dir_temp'/datawithpredbcw.dta", clear
keep t
duplicates drop
save `time'

* Read the raw MCB file; variable names are on row 5 and data start there
import delimited "`dir_raw'/FICTO UNIPERSONALES.csv", clear rowrange(5) varnames(5)
local mcbvars valorbfc mgravado aportebps

* Amounts are read as text with thousands separators; strip the commas
foreach v of local mcbvars {
	destring `v', replace ignore(",")
}

* Convert the effective date (month-year text) into a monthly date
generate t = mofd(date(fvigencia, "MY", 2019))
format t %tm
sort t

* Add every panel month, then carry the last known value forward
merge 1:1 t using `time'
sort t
tset t
foreach v of local mcbvars {
	replace `v' = l.`v' if `v' == .
}

* Copy the first row into the second row.
* NOTE(review): this overwrites row 2 unconditionally; the reason is not
* visible in this file - confirm against the raw CSV layout.
sort t
foreach v of local mcbvars {
	replace `v' = `v'[1] if _n == 2
}

* Keep only months that exist in the panel
drop if _merge == 1
drop _merge fvigencia

* Attach the monthly MCB values to every panel observation
merge 1:m t using "`dir_temp'/datawithpredbcw.dta"
drop _merge

generate rficto = mgravado/(1000*ipc)
label variable rficto "Min Contribution Base in 1000 Pesos of 2015"

save "`dir_temp'/datawithpredbcw.dta", replace



**************************************************************************
**************************************************************************
* GENERATE DATASETS CENTERED AROUND MEDIAN AND EXPECTATION OF RETIREMENT *
**************************************************************************
**************************************************************************

* Build one working dataset per predicted-BCW definition (medbcw, expbcw).
* Each pass reloads the common file, keeps the event window of that
* definition, adds retirement-probability weights and sample flags, and
* saves workingdata_<definition>.dta in the temp folder.
foreach X in medbcw expbcw {

	local opendataname "datawithpredbcw.dta"
	local savedataname "workingdata_`X'.dta"

	use "`dir_temp'/`opendataname'", clear
	
	*** Keep observations in the relevant interval around PREDICTED start BCW
	rename time_bcw_`X' time_bcw
	rename timemonths_bcw_`X' timemonths_bcw
		
	keep if  time_bcw!=. 
	
	** weights based on predicted probability of retirement, for each (actual) age
	g Fbcw=.
	forvalues a=0/6 {
		replace Fbcw=F`a'oprobit if agedisc==50+`a'
	}
	replace Fbcw=1 if agedisc>56 // model is bounded at age 57
	g pbcw=1 if time_bcw<0
	replace pbcw=Fbcw if time_bcw>=0

	* Display-only summaries of the weights. tabstat is used because the
	* contents() option of table is rejected by Stata 17 and later.
	tabstat pbcw, by(agedisc) statistics(mean median min max)
	tabstat pbcw, by(time_bcw) statistics(mean median min max)

	* Age (completed years) at which event time equals zero, per person
	cap drop aux
	g aux=agedisc if time_bcw==0
	bys i: egen age_startbcw=max(aux)
	drop aux

	*****************************************************************************
	*** SAMPLE only observations that correspond to reported self_employment 
	*****************************************************************************

	* Determine each person's max number of months in sample (depending on cohort)
	local firstyrfull=1996-(45)
	local lastyrfull=2016-(57)+1

	g max_months=156
	replace max_months=156 - ( tm(`firstyrfull'm4)-birth_month  ) if birth_month < tm(`firstyrfull'm4)
	replace max_months=156 - ( birth_month - tm(`lastyrfull'm3)) if birth_month > tm(`lastyrfull'm3)
	tab max_months

	
	* Select sample
	* The inner loops use their own macro name so that the outer loop macro X
	* is not overwritten while the outer body is still running.
	foreach state in empl self_empl {
		* individuals observed employed/self_employed for at least 6 months overall
		bys i : egen count_m`state' = total(`state') 
		g prop_m`state'= count_m`state'/max_months 
		* Observation-level flag: in this state, and person has 6+ months in it
		g sample_`state'=`state'==1 & (count_m`state'>=6)
		* Create isample to identify *individuals* who have observations in each sample
		bys i: egen isample_`state' = max(sample_`state')
	}

	* Person-level flags: ever observed employed / ever observed self-employed
	foreach state in empl self_empl {
		bysort i: egen `state'_any2=max(`state')
	}

	tab isample_empl sample_empl, m

	* Count individuals in each sample (one row per person via order==1)
	sort i t
	bysort i: g order=_n
	count if isample_empl==1 & order==1
	count if isample_self_empl==1 & order==1

	* Number of observations per person
	cap drop aux
	g aux=1
	bys i : egen count_anyobs = total(aux) 

	
	********* 


	* Keep relevant variables
	keep i t j year age* *empl* W* remC1_sum remC2_sum remC3_sum amt_* ben ciiu* Tipocontr ipc ndep *ficto* ndep_cat *sample* birth_month *count* max_months cohort Fing Fegr hrsmonth trem_*_max status_*_max pbcw *oprobit age_startbcw *_1stobs tenure1yrs_at49 withempl firmid_at49 time_bcw timemonths_bcw

	save "`dir_temp'/`savedataname'", replace

}



********************************************************************************
**** PREPARE DATA FOR EVENT STUDIES 							
********************************************************************************

foreach X in medbcw expbcw {

	* Input comes from the temp folder; the output name is used by the save
	* and merge commands further down in this loop
	local opendataname "workingdata_`X'"
	local savedataname "mainsample_`X'"

	use "`dir_temp'/`opendataname'", clear

	* Running observation number within person, in time order
	sort i t
	bysort i: generate order = _n

	* Youngest and oldest age at which each person is observed
	bysort i: egen minage = min(age)
	bysort i: egen maxage = max(age)
	summarize age minage maxage

	* Collapse the 2-digit industry code into eight broad groups
	generate ciiu1 = 1 if ciiu2 < 10                                        // Agriculture and mining
	replace ciiu1 = 2 if inrange(ciiu2, 10, 33) | ciiu2 == 95               // Manufacturing
	replace ciiu1 = 3 if inrange(ciiu2, 35, 39)                             // Energy and waste disposal
	replace ciiu1 = 4 if inrange(ciiu2, 41, 43)                             // Construction
	replace ciiu1 = 5 if inrange(ciiu2, 45, 47) | inrange(ciiu2, 55, 56)    // Wholesale and retail, restaurants, hotels
	replace ciiu1 = 6 if inrange(ciiu2, 49, 53) | ciiu2 == 61               // Transport and communications
	replace ciiu1 = 7 if inrange(ciiu2, 62, 82) | ciiu2 == 96               // Services
	replace ciiu1 = 8 if inrange(ciiu2, 84, 94) | inrange(ciiu2, 58, 60) | ciiu2 == 97 // Public admin, social and domestic services

	* Four sector dummies, left missing when the industry code is missing.
	* The services dummy also takes in construction (group 4), i.e. construction
	* firms that are not in the construction pension system.
	generate manufacturing = ciiu1 == 2 if ciiu2 != .
	generate retailhospitality = ciiu1 == 5 if ciiu2 != .
	generate transportenergy = (ciiu1 == 6 | ciiu1 == 3) if ciiu2 != .
	generate services = (ciiu1 == 7 | ciiu1 == 8 | ciiu1 == 4) if ciiu2 != .

	label variable manufacturing "Manufacturing"
	label variable retailhospitality "Retail, Restaurants, Hotels"
	label variable transportenergy "Transport, Communications, Energy"
	label variable services "Services, Other"

	* Single categorical version of the four dummies
	generate sector = 1 if manufacturing == 1
	replace sector = 2 if retailhospitality == 1
	replace sector = 3 if transportenergy == 1
	replace sector = 4 if services == 1

	* Tenure in months since the month of Fing, plus a 12-months-or-more flag
	generate jobstart = mofd(Fing)
	format jobstart %tm
	generate tenure = t - jobstart
	summarize tenure if self_empl == 1, detail
	summarize tenure if empl == 1, detail
	generate tenure_1yrs = tenure >= 12 if tenure != .

	* JOB CHANGES
	* auxj holds the firm id j of the previous calendar month; when that month
	* is not in the data it falls back to the previous row of the same person
	xtset i t
	sort i t
	capture drop auxj
	bysort i: generate auxj = L.j
	sort i t
	replace auxj = j[_n-1] if auxj == . & i == i[_n-1]

	* Firm differs from the previous one (undefined on rows where auxj is missing)
	generate jobchange = auxj != j if auxj != .

	* Any change within the same person and age year; blanked where jobchange
	* itself is undefined
	bysort i agedisc: egen jobchange_atage = max(jobchange)
	replace jobchange_atage = . if jobchange == .

	* Any change at all for the person
	bysort i: egen anyjobchange = max(jobchange)


	********************************************************************************
	**** PREPARE DATA FOR EVENT STUDIES 							
	********************************************************************************

	* Event-time indicators built from time_bcw (years relative to the
	* predicted start of the BCW)
	generate post03 = time_bcw >= 0 & time_bcw < 4
	label variable post03 "0-3 yrs. post start BCW"

	generate post4 = time_bcw >= 4
	label variable post4 "4+ yrs. post start BCW"

	generate pre2 = time_bcw <= -2
	label variable pre2 "2+ yrs. pre start BCW"

	generate pre25 = inrange(time_bcw, -5, -2)
	label variable pre25 "2-5 yrs. pre start BCW"

	generate pre6 = time_bcw <= -6
	label variable pre6 "6+ yrs. pre start BCW"

	generate post0 = time_bcw >= 0
	label variable post0 "Post start BCW"

	* Same as pre25 but without year -5 (used when two dummies are dropped)
	generate pre24 = time_bcw > -5 & time_bcw <= -2
	label variable pre24 "2-4 yrs. pre start BCW"

	* Event-time trend in years, its square, and interactions with the shifts
	generate bcwttrend = timemonths_bcw/12
	label variable bcwttrend "Event-time trend"

	generate bcwttrend2 = bcwttrend^2
	label variable bcwttrend2 "Event-time trend squared"

	generate bcwttrend_post0 = bcwttrend*post0
	label variable bcwttrend_post0 "Post BCW x event-time trend"

	generate bcwttrend2_post0 = bcwttrend2*post0
	label variable bcwttrend2_post0 "Post BCW x trend squared"

	generate bcwttrend_post4 = bcwttrend*post4
	label variable bcwttrend_post4 "4+ yrs. post BCW x event-time trend"

	generate bcwttrend_pre6 = bcwttrend*pre6
	label variable bcwttrend_pre6 "6+ yrs. pre BCW x event-time trend"

	* Reports MCB: compare earnings with the minimum contribution base after
	* rounding both to two decimals
	generate Wround = round(W, 0.01)
	generate rfictoround = round(rficto, 0.01)
	summarize Wround W rfictoround rficto

	* Earnings exactly at the MCB (defined for self-employed rows only)
	generate reports_ficto = Wround == rfictoround if self_empl == 1
	summarize reports_ficto if self_empl == 1

	* Earnings at or below the MCB (defined for self-employed rows only)
	generate reports_uptoficto = Wround <= rfictoround if self_empl == 1
	summarize reports_uptoficto if self_empl == 1

	** NEW RR1 FIRM SIZE CATEGORIES **
	* Shift the existing size codes up by one so that code 0 can be used for
	* firms with no employees, then attach value labels matching the new codes
	replace ndep_cat=ndep_cat+1
	replace ndep_cat=0 if ndep==0
	cap label drop ndep_cat
	label define ndep_cat 0 "no employees" 1 "1-4" 2 "5-9" 3 "10-19" 4 "20-49" 5 "50-249" 6 "250 plus" 
	label values ndep_cat ndep_cat

	*Dummies for descriptives
	g noempl	=ndep_cat==0
	g micro		=ndep_cat==1 
	g micro2	=ndep_cat==2
	* small covers codes 0-2, i.e. every firm with fewer than 10 workers, and
	* larger covers codes 3 and above, i.e. 10 or more workers
	g small		=ndep_cat<=2
	g larger	=ndep_cat>=3 & ndep_cat<.

	label var noempl "No employees"
	label var micro "Firm size $<5$ workers"
	label var micro2 "Firm size 5-9 workers"
	* The label previously read 10-49 workers, which did not match the
	* definition of small above.
	* NOTE(review): if 10-49 was the intended group, change the definition
	* of small rather than this label.
	label var small	"Firm size $<$10 workers"
	label var larger "Firm size $\geq$10 workers"


	* Earnings expressed as a multiple of the self-employed minimum (MCB)
	generate Wficto = W/rficto
	label variable Wficto "Earnings/MCB"

	* Labels for descriptive tables
	label variable year "Year"
	label variable cohort "Birth cohort"
	label variable age "Age"
	label variable prop_mempl "Prop. time employed"
	label variable prop_mself_empl "Prop. time self employed"
	label variable W "Reported earnings (1,000 UYP)"

	* Event-study terms that get interacted with each group indicator below
	local evvars bcwttrend post03 post4 pre2 pre25 pre24 pre6 post0 bcwttrend_post0 bcwttrend_pre6

	* GROUPS OF COHORTS (TRANSITION REGIME)
	* Cohort groups are defined by month of birth
	generate oldsystem = birth_month < tm(1936m4)
	generate transition = birth_month >= tm(1936m4) & birth_month < tm(1956m4)
	foreach v of local evvars {
		generate `v'_transition = `v'*transition
	}
	label variable transition "Transition"
	label variable post03_transition "Transition x 0-3 post BCW"
	label variable post4_transition "Transition x 4+ post BCW"
	label variable pre2_transition "Transition x 2+ pre BCW"
	label variable pre25_transition "Transition x 2-5 pre BCW"
	label variable pre24_transition "Transition x 2-4 pre BCW"
	label variable pre6_transition "Transition x 6+ pre BCW"
	label variable post0_transition "Transition x post BCW"
	label variable bcwttrend_transition "Transition x event-time trend"
	label variable bcwttrend_post0_transition "Transition x post BCW x trend"
	label variable bcwttrend_pre6_transition "Transition x 6+ pre BCW x trend"

	* WITH EMPLOYEES WHEN FIRST OBSERVED
	foreach v of local evvars {
		generate `v'_withempl = `v'*withempl
	}
	label variable withempl "Employer"
	label variable post03_withempl "Employer x 0-3 post BCW"
	label variable post4_withempl "Employer x 4+ post BCW"
	label variable pre2_withempl "Employer x 2+ pre BCW"
	label variable pre25_withempl "Employer x 2-5 pre BCW"
	label variable pre24_withempl "Employer x 2-4 pre BCW"
	label variable pre6_withempl "Employer x 6+ pre BCW"
	label variable post0_withempl "Employer x post BCW"
	label variable bcwttrend_withempl "Employer x event-time trend"
	label variable bcwttrend_post0_withempl "Employer x post BCW x trend"
	label variable bcwttrend_pre6_withempl "Employer x 6+ pre BCW x trend"

	* INTERACTIONS FOR DID
	foreach v of local evvars {
		generate `v'_self_empl = `v'*self_empl
	}
	label variable post03_self_empl "Self-employed x 0-3 post BCW"
	label variable post4_self_empl "Self-employed x 4+ post BCW"
	label variable pre2_self_empl "Self-employed x 2+ pre BCW"
	label variable pre25_self_empl "Self-employed x 2-5 pre BCW"
	label variable pre24_self_empl "Self-employed x 2-4 pre BCW"
	label variable pre6_self_empl "Self-employed x 6+ pre BCW"
	label variable post0_self_empl "Self-employed x post BCW"
	label variable bcwttrend_self_empl "Self-employed x event-time trend"
	label variable bcwttrend_post0_self_empl "Self-employed x post BCW x trend"
	label variable bcwttrend_pre6_self_empl "Self-employed x 6+ pre BCW x trend"

	* LARGE SIZE FIRM (FOR EMPLOYEES)
	* Uses the size indicator measured at the first observation
	foreach v of local evvars {
		generate `v'_larger = `v'*large_1stobs
	}
	label variable post03_larger "Large firm x 0-3 post BCW"
	label variable post4_larger "Large firm x 4+ post BCW"
	label variable pre2_larger "Large firm x 2+ pre BCW"
	label variable pre25_larger "Large firm x 2-5 pre BCW"
	label variable pre24_larger "Large firm x 2-4 pre BCW"
	label variable pre6_larger "Large firm x 6+ pre BCW"
	label variable post0_larger "Large firm x post BCW"
	label variable bcwttrend_larger "Large firm x event-time trend"
	label variable bcwttrend_post0_larger "Large firm x post BCW x trend"
	label variable bcwttrend_pre6_larger "Large firm x 6+ pre BCW x trend"

	* SMALL SIZE FIRM (FOR EMPLOYEES)
	* Uses the size indicator measured at the first observation
	foreach v of local evvars {
		generate `v'_small = `v'*small_1stobs
	}
	label variable post03_small "Small firm x 0-3 post BCW"
	label variable post4_small "Small firm x 4+ post BCW"
	label variable pre2_small "Small firm x 2+ pre BCW"
	label variable pre25_small "Small firm x 2-5 pre BCW"
	label variable pre24_small "Small firm x 2-4 pre BCW"
	label variable pre6_small "Small firm x 6+ pre BCW"
	label variable post0_small "Small firm x post BCW"
	label variable bcwttrend_small "Small firm x event-time trend"
	label variable bcwttrend_post0_small "Small firm x post BCW x trend"
	label variable bcwttrend_pre6_small "Small firm x 6+ pre BCW x trend"

	* MCB when first observed
	label variable ficto_1stobs "Reported MCB 1st obs"

	* HOURS AND WAGES

	* Flag rows where any of the trem_2 to trem_5 indicators equals 1
	generate othpay = trem_2_max == 1 | trem_3_max == 1 | trem_4_max == 1 | trem_5_max == 1

	* Blank out monthly hours below the 5th percentile
	summarize hrsmonth, detail
	replace hrsmonth = . if hrsmonth < r(p5)

	* Earnings per hour, computed only where hours are positive
	generate wagephr = W/hrsmonth if hrsmonth > 0

	* STAYERS IN FIRM: employed in the same firm as recorded in firmid_at49
	generate match_firm49 = empl == 1 & j == firmid_at49

	save "`dir_clean'/`savedataname'", replace

	* Bring in the self-employment sample flag from the age-50 analysis file
	* and attach it to every row of the dataset saved just above
	use "`dir_clean'/mainsample_age50analysis.dta", clear
	keep i sample_self_empl
	duplicates drop
	merge 1:m i using "`dir_clean'/`savedataname'"
	* NOTE(review): unmatched rows from either side are kept here - confirm
	* that ids without any panel rows are meant to stay in the file
	drop _merge

	* Reclassify individuals in the sample who switch, so that the employment
	* indicators agree with the sample flag. empl is updated first in each
	* pair because both conditions test the current value of self_empl.
	replace empl = 0 if sample_self_empl == 1 & self_empl == 0
	replace self_empl = 1 if sample_self_empl == 1 & self_empl == 0
	replace empl = 1 if sample_self_empl == 0 & self_empl == 1
	replace self_empl = 0 if sample_self_empl == 0 & self_empl == 1

	save "`dir_clean'/`savedataname'", replace
}

* Stop executing the do-file here; anything placed below this line is not run
exit

